/*************************************************************************
 * The contents of this file are subject to the MYRICOM MYRINET          *
 * EXPRESS (MX) NETWORKING SOFTWARE AND DOCUMENTATION LICENSE (the       *
 * "License"); User may not use this file except in compliance with the  *
 * License.  The full text of the License can found in LICENSE.TXT       *
 *                                                                       *
 * Software distributed under the License is distributed on an "AS IS"   *
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See  *
 * the License for the specific language governing rights and            *
 * limitations under the License.                                        *
 *                                                                       *
 * Copyright 2003 - 2004 by Myricom, Inc.  All rights reserved.          *
 *************************************************************************/

/* Revision-control identification string compiled into the object file. */
static const char __idstring[] = "@(#)$Id: mx.c,v 1.131 2006/12/12 19:16:39 patrick Exp $";

#include "mx_arch.h"
#include "mx_misc.h"
#include "mx_instance.h"
#include "mx_malloc.h"
#include "mx_mcp_interface.h"
#include "mx_peer.h"
#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>

#include <sys/filedesc.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/linker.h>
#include <sys/memrange.h>
#include <sys/signalvar.h>
#include <sys/sched.h>

#if MX_KERNEL_LIB
/* Kernel-library setup/teardown hooks; called from mx_attach() and
   mx_detach() in this file, defined elsewhere. */
int mx_init_klib(void);
void mx_finalize_klib(void);
#endif

/* Declare a module dependency on mx_mcp (min/preferred/max version 1). */
MODULE_DEPEND(mx, mx_mcp, 1, 1, 1);

/* Non-zero once driver-wide state has been set up; defined elsewhere. */
extern int mx_initialized;
/* Remaining budget of user pages that mx_pin_page() may wire; it is
   decremented on pin and incremented on unpin. */
mx_atomic_t mx_max_user_pinned_pages;
/* Read by mx_detach()/mx_stop_mapper() to decide whether the module is
   still in use.  NOTE(review): presumably maintained by the
   MOD_INC_USE_COUNT/MOD_DEC_USE_COUNT macros -- confirm. */
int mx_use_count = 0;

/* newbus device methods and the interrupt handler, defined below */
static int mx_probe_dev(device_t dev);
static int mx_attach(device_t dev);
static int mx_detach(device_t dev);
static int mx_shutdown(device_t dev);
static void mx_intr(void *arg);

/* newbus method table: maps the generic device operations onto the
   mx_* implementations in this file.  Terminated by a zero entry. */
static device_method_t mx_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mx_probe_dev),
  DEVMETHOD(device_attach, mx_attach),
  DEVMETHOD(device_detach, mx_detach),
  DEVMETHOD(device_shutdown, mx_shutdown),
  
  {0, 0}
};

/* Mutex initialized in mx_attach() and destroyed in mx_detach().
   NOTE(review): presumably serializes driver console output -- confirm
   against the MX_WARN/MX_INFO macro definitions. */
struct mtx mx_print_mtx;

/* Driver description: name, method table, and the size of the per-device
   softc (an mx_instance_state_t, retrieved with device_get_softc()). */
static driver_t mx_driver =
{
  "mx",
  mx_methods,
  sizeof(mx_instance_state_t),
};

static devclass_t mx_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mx, pci, mx_driver, mx_devclass, 0, 0);
MODULE_VERSION(mx, 1);

/* Character-device entry points, defined below */
static d_open_t mx_open;
static d_close_t mx_close;
static d_ioctl_t mx_ioctl;
static d_mmap_t mx_mmap;
static d_read_t mx_read;

/* Character-device switch shared by every /dev/mx_fake.%d clone node
   created in mx_clone(). */
static struct cdevsw mx_cdevsw =
{
  .d_version = D_VERSION,
  .d_open =  mx_open,
  .d_close = mx_close,
  .d_ioctl = mx_ioctl,
  .d_mmap =  mx_mmap,
  .d_read = mx_read,
  .d_name =  "mx"
};

/* Clone bookkeeping: the clone list handed to clone_create(), and the tag
   of the dev_clone event handler registered in mx_attach(). */
static struct clonedevs *mx_clones;
static eventhandler_tag event_tag;
/* The dev_clone handler signature depends on the kernel version: some
   versions pass the opener's credentials as a second argument. */
#if MX_FREEBSD_CLONE_TAKES_UCRED
static void mx_clone(void *arg, struct ucred *uc, char *name, int namelen, struct cdev **cdev);
#else
static void mx_clone(void *arg, char *name, int namelen, struct cdev **cdev);
#endif
/* On kernels older than 502108, clone_setup() is compiled out. */
#if __FreeBSD_version < 502108
#define clone_setup(x)
#endif

/*
 * Probe routine: decide whether this PCI device is a Myricom board that
 * this driver handles and, if so, set its description string.
 *
 * Returns 0 when the device is ours, ENXIO otherwise.
 */
static int
mx_probe_dev(device_t dev)
{
  const char *desc;

  /* Every board we support carries the Myricom vendor id. */
  if (pci_get_vendor(dev) != MX_PCI_VENDOR_MYRICOM)
    return ENXIO;

  /* PCI-X boards: one device id, distinguished by revision (4 and up). */
  if (pci_get_device(dev) == MX_PCI_DEVICE_MYRINET &&
      pci_get_revid(dev) >= 4) {
    switch (pci_get_revid(dev)) {
    case 4:
      desc = "Myrinet PCIXD";
      break;
    case 5:
      desc = "Myrinet PCIXE";
      break;
    case 6:
      desc = "Myrinet PCIXF";
      break;
    default:
      desc = "Myrinet PCIX???";
      break;
    }
    device_set_desc(dev, desc);
    return 0;
  }

  /* PCIe boards: distinguished by device id. */
  switch (pci_get_device(dev)) {
  case MX_PCI_DEVICE_Z4E:
    device_set_desc(dev, "Myrinet PCIe Z4E");
    return 0;
  case MX_PCI_DEVICE_Z8E:
    device_set_desc(dev, "Myrinet PCIe Z8E");
    return 0;
  default:
    return ENXIO;
  }
}

/*
 * Ask the kernel to mark the physical range [pa, pa + len) as
 * write-combining.  Only compiled in on x86; a no-op elsewhere.
 * Failure is not fatal: it is only reported, and only when
 * mx_debug_mask is non-zero.
 */
static void
mx_set_writecombine(vm_offset_t pa, vm_offset_t len)
{
#if MX_CPU_x86
  struct mem_range_desc range;
  int op = MEMRANGE_SET_UPDATE;
  int err;

  range.mr_base = pa;
  range.mr_len = len;
  range.mr_flags = MDF_WRITECOMBINE;
  strcpy((char *)&range.mr_owner, "mx");

  err = mem_range_attr_set(&range, &op);
  if (err == 0 || !mx_debug_mask)
    return;

  MX_WARN(("failed to set MDF_WRITECOMBINE for pa 0x%lx, len 0x%lx, err = %d\n",
           (unsigned long)pa, (unsigned long)len, err));
#endif
}

/* Set to 1 by mx_watchdog_stop() to ask the watchdog thread to exit;
   its address doubles as the thread's sleep channel. */
static int mx_watchdog_thread_should_exit = 0;
/* pid of the watchdog kernel thread, or -1 when there is none */
static pid_t mx_watchdog_pid;

/*
 * Body of the watchdog kernel thread: run mx_watchdog_body(), sleep for
 * up to MX_WATCHDOG_TIMEOUT seconds (or until woken), and repeat until
 * asked to exit.  The body always runs at least once.
 */
static void
mx_watchdog_thread(void *dontcare)
{
  for (;;) {
    mx_watchdog_body();
    tsleep(&mx_watchdog_thread_should_exit, PZERO, "mx_watchog",
           MX_WATCHDOG_TIMEOUT * hz);
    if (mx_watchdog_thread_should_exit)
      break;
  }
  kthread_exit(0);
}

/*
 * Ask the watchdog thread to exit and wait until its pid can no longer
 * be found.  Does nothing when no watchdog thread is recorded
 * (mx_watchdog_pid == -1).
 */
static void
mx_watchdog_stop(void)
{
  struct proc *watchdog;

  if (mx_watchdog_pid == -1)
    return;

  /* raise the exit flag and kick the thread out of its sleep */
  mx_watchdog_thread_should_exit = 1;
  wakeup(&mx_watchdog_thread_should_exit);

  /* poll every quarter second until the process is gone */
  for (;;) {
    tsleep((void *)&mx_watchdog_pid, PZERO, "mx_watchog_exit", hz/4);
    watchdog = pfind(mx_watchdog_pid);
    if (watchdog == NULL)
      break;
    /* pfind() hands the process back locked */
    PROC_UNLOCK(watchdog);
  }
  mx_watchdog_pid = -1;
}


/*
 * The device attach function.  This function returns 0 on success,
 * nonzero on failure. Note that error return codes are not propagated
 * back to the user via kldload, so carefully translating them is
 * mostly useless -- a printf is much more useful.
 *
 * The first successful attach also performs the one-time driver setup
 * (tunables, print mutex, mx_init_driver(), watchdog thread, dev_clone
 * handler).  On failure everything acquired so far is released in
 * reverse order and is->arch.irq is left NULL, which mx_detach() uses
 * as its "never attached" marker.
 */

static int
mx_attach(device_t dev)
{
  int error = ENXIO;
  mx_instance_state_t *is = device_get_softc(dev);
  int unit = device_get_unit(dev);
  int initialized_mx = 0;
  int rid;
  int status;
  struct proc *watchdog_proc;

  if (mx_num_instances >= mx_max_instance) {
    error = ENXIO;
    goto abort_with_nothing;
  }

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
                 ("mx: attach, is = %p, size %d\n", is, (int)sizeof(*is)));
  is->arch.dev = dev;
  if (!mx_initialized) {
#if MX_DEBUG
    TUNABLE_INT_FETCH("mx.debug_mask", &mx_debug_mask);
#endif
    TUNABLE_INT_FETCH("mx.max_instance", &mx_max_instance);
    TUNABLE_INT_FETCH("mx.small_message_threshold",
                      &mx_small_message_threshold);
    TUNABLE_INT_FETCH("mx.medium_message_threshold",
                      &mx_medium_message_threshold);
    TUNABLE_INT_FETCH("mx.security_disabled", &mx_security_disabled);
    TUNABLE_INT_FETCH("mx.max_nodes", &mx_max_nodes);
    TUNABLE_INT_FETCH("mx.max_endpoints", &mx_max_endpoints);
    TUNABLE_INT_FETCH("mx.max_send_handles", &mx_max_send_handles);
    TUNABLE_INT_FETCH("mx.max_rdma_windows", &mx_max_rdma_windows);
    TUNABLE_INT_FETCH("mx.intr_coal_delay", &mx_intr_coal_delay);
    TUNABLE_INT_FETCH("mx.override_e_to_f", &mx_override_e_to_f);
    mtx_init(&mx_print_mtx, "mx print mutex", NULL, MTX_DEF);
    status = mx_init_driver();
    if (status != 0) {
      device_printf(dev, "could not init mx\n");
      /* nothing else was set up yet; just undo the mtx_init() above */
      mtx_destroy(&mx_print_mtx);
      error = ENXIO;
      goto abort_with_nothing;
    }
    initialized_mx = 1;

    /* determine how much memory can be pinned */

    mx_max_user_pinned_pages = 2 * vm_page_max_wired;
    MX_INFO (("MX: mx_register_memory will be able to pin "
              "%d pages (%d MBytes)\n", mx_max_user_pinned_pages,
              (mx_max_user_pinned_pages * PAGE_SIZE) / (1024 * 1024)));

    /* a missing watchdog is not fatal; -1 tells mx_watchdog_stop()
       that there is nothing to stop */
    status = kthread_create(mx_watchdog_thread, 0, &watchdog_proc,
                            0, 0, "mx watchdog thread");
    if (status) {
      MX_WARN(("Failed to create watchdog thread\n"));
      mx_watchdog_pid = -1;
    } else {
      mx_watchdog_pid = watchdog_proc->p_pid;
    }
    clone_setup(&mx_clones);
    event_tag = EVENTHANDLER_REGISTER(dev_clone, mx_clone, 0, 1000);
    if (!event_tag)
      clone_cleanup(&mx_clones);
  }

  /*
   * Map control/status registers.
   */

  rid = PCIR_BARS;
  is->arch.mem = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid,
                                    0, ~0, 1, RF_ACTIVE|PCI_RF_DENSE);
  if (!is->arch.mem) {
    device_printf(dev, "could not map memory\n");
    error = ENXIO;
    goto abort_with_mx_init;
  }

  /*
   *  map the device into the kernel virtual address space.  FreeBSD
   *  would rather we used bus_space access, but the MI portions of MX
   *  don't support that.
   */
  is->arch.csr = rman_get_virtual(is->arch.mem);
  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
                 ("MX: board mapped at KVA: %p\n", is->arch.csr));

  mx_set_writecombine(rman_get_start(is->arch.mem),
                      rman_get_size(is->arch.mem));
  /*
   * Allocate our interrupt.
   */
  rid = 0;
  is->arch.irq = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0, 1,
                                    RF_SHAREABLE | RF_ACTIVE);
  if (is->arch.irq == NULL) {
    device_printf(dev, "could not map interrupt\n");
    error = ENXIO;
    goto abort_with_mem;
  }

  error = bus_setup_intr(dev, is->arch.irq,
                         INTR_TYPE_MISC | INTR_MPSAFE,
                         mx_intr, is, &is->arch.ih);
  if (error) {
    device_printf(dev, "could not setup irq\n");
    goto abort_with_intr;
  }
  /*
   *  Some MX setup
   */

  if (mx_instance_init(is, unit) != 0) {
    MX_NOTE (("mx_instance_init failed\n"));
    /* error is 0 here (bus_setup_intr succeeded); make sure the
       failure is actually reported to the caller */
    error = ENXIO;
    goto abort_with_irq;
  }

#if MX_KERNEL_LIB
  /* initialize the kernel library */
  error = mx_init_klib();
  if (error) {
    mx_instance_finalize(is);
    goto abort_with_irq;
  }
#endif

  mx_mutex_exit(&is->sync);
  return (0);


  /* Unwind in strict reverse order of acquisition. */

 abort_with_irq:
  /* interrupt handler was installed: remove it before releasing the IRQ */
  bus_teardown_intr(dev, is->arch.irq, is->arch.ih);

 abort_with_intr:
  /* IRQ resource was allocated (handler not, or no longer, installed) */
  bus_release_resource(dev, SYS_RES_IRQ, 0, is->arch.irq);
  is->arch.irq = NULL;

 abort_with_mem:
  bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, is->arch.mem);

 abort_with_mx_init:
  if (initialized_mx) {
    mx_initialized = 0;
    mx_watchdog_stop();
    if (event_tag) {
      EVENTHANDLER_DEREGISTER(dev_clone, event_tag);
      clone_cleanup(&mx_clones);
    }
    event_tag = NULL;
    mx_finalize_driver();
    /* destroy the print mutex last, matching the order in mx_detach() */
    mtx_destroy(&mx_print_mtx);
  }
 abort_with_nothing:
  return error;
}

/*
 * device_shutdown method.  Nothing is torn down here; the call is only
 * logged.  Always returns 0.
 */
static int
mx_shutdown(device_t dev)
{
  printf("mx_shutdown called\n");

  return 0;
}

/*
 * Post a devctl "start_mapper" notification for board `is`, carrying
 * the board name ("mx<id>") as its data.  Always returns 0.
 */
int
mx_start_mapper(mx_instance_state_t *is)
{
  char board_name[16];

  snprintf(board_name, sizeof(board_name), "mx%d", is->id);
  devctl_notify("mx", "start_mapper", "dontcare", board_name);

  return 0;
}

/*
 * Kill the process that holds the raw endpoint of board `is` (the
 * mapper), then wait a bounded time for it to let go.
 *
 * At most 16 sleeps of hz/8 ticks are shared between the two wait
 * phases: first until is->raw.es is cleared, then until mx_use_count
 * changes from the value sampled on entry.  The second phase always
 * sleeps at least once.
 *
 * Always returns 0, whether or not the mapper actually went away.
 */
int
mx_stop_mapper(mx_instance_state_t *is)
{
  pid_t pid;
  struct proc *p;
  int count = 16;
  int old_use_count;

  old_use_count = mx_atomic_read(&mx_use_count);

  /* sample the opener's pid under the instance lock */
  mx_mutex_enter(&is->sync);
  if (is->raw.es == NULL) {
    mx_mutex_exit(&is->sync);
    return 0;
  }
  pid = is->raw.es->opener.pid;
  mx_mutex_exit(&is->sync);

  /* look among live processes first, then among zombies */
  if ((p = pfind(pid)) == NULL) {
    if ((p = zpfind(pid)) == NULL)
      return 0;
  }
  psignal(p, SIGKILL);
  PROC_UNLOCK(p);

  /* from here on p is only used as a sleep channel address */

  /* phase 1: wait for the raw endpoint to be released */
  do {
    tsleep(p, PZERO, "killmapper", hz/8);
    count--;
  } while (is->raw.es != NULL && count != 0);

  /* phase 2: wait for the use count to change.  count may already be 0
     here, so test with "> 0": the old "!= 0" test let it go negative
     and made this wait effectively unbounded. */
  do {
    tsleep(p, PZERO, "killmapper", hz/8);
    count--;
  } while (count > 0 && old_use_count == mx_atomic_read(&mx_use_count));

  return 0;
}


/*
 * device_detach method.
 *
 * Returns 0 on success (or when the instance never attached), EBUSY
 * when the instance or the module is still in use, or the error from
 * mx_instance_finalize().  When the last instance goes away the
 * driver-wide state set up by the first attach is torn down as well.
 */
static int
mx_detach(device_t dev)
{
  mx_instance_state_t *is = device_get_softc(dev);
  int status;
  int use_count;

  /* if we failed to attach, then don't try to tear
     any state down */
  if (is->arch.irq == NULL)
    return 0;

  mx_stop_mapper(is);

  if (is->ref_count) {
    printf("mx%d: %d refs outstanding, unload denied\n",
           is->id, is->ref_count);
    devctl_notify("mx", "unload", "unload", "unload");
    return EBUSY;
  }

  /* if any process is using the module, abort the unload */
  use_count = mx_atomic_read(&mx_use_count);
  if (use_count != 0) {
    printf("mx%d: use_count = %d unload denied\n",
           is->id, use_count);
    devctl_notify("mx", "unload", "unload", "unload");
    return EBUSY;
  }

#if MX_KERNEL_LIB
  mx_finalize_klib();
#endif

  /* finalize instance: quiesce the board, then remove the handler
     before releasing the IRQ it was attached to */
  if (is->board_ops.disable_interrupt != NULL)
    is->board_ops.disable_interrupt(is);
  bus_teardown_intr(dev, is->arch.irq, is->arch.ih);
  bus_release_resource(dev, SYS_RES_IRQ, 0, is->arch.irq);

  /* NOTE(review): if this fails, the IRQ is already released but
     is->arch.irq is still non-NULL, so a second detach attempt would
     release it again -- confirm whether finalize can fail here. */
  status = mx_instance_finalize(is);
  if (status)
    return (status);

  /* last instance gone: undo the one-time setup done by mx_attach() */
  if (mx_num_instances == 0) {
    mx_watchdog_stop();
    if (event_tag)
      EVENTHANDLER_DEREGISTER(dev_clone, event_tag);
    mx_finalize_driver();
    mtx_destroy(&mx_print_mtx);
    clone_cleanup(&mx_clones);
  }

  /*
   * Deallocate resources.
   */
  bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, is->arch.mem);

  return(0);
}

/* this function is called by devfs when somebody attempts
   to open /dev/mx%d, and that device entry does not exist.

   Recognized names are "mxctlp" and "mxctl" (control devices, not tied
   to a board) and "mxp<unit>" / "mx<unit>" (per-board devices); the
   "p" variants are privileged.  On success *cdev is set to a clone
   device whose si_drv1 encodes the in-use flag, the privileged flag
   and the requested unit.  On any failure the function simply returns
   and leaves *cdev untouched. */

static void
mx_clone(void *arg, 
#if MX_FREEBSD_CLONE_TAKES_UCRED
	 struct ucred *uc,
#endif
	 char *name, int namelen, struct cdev **cdev)
{
  unsigned int u;
  int i, privileged, mode, fake_unit, free_cdev, ctl;

  /* an earlier handler already produced a device for this name */
  if (*cdev != NULL) {
    /*    printf("mx_clone called with non-null struct cdev *??\n");*/
    return;
  }

  /* Classify the requested name.  The control names are matched
     exactly; the per-board names go through dev_stdclone(), which
     (presumably -- see dev_stdclone docs) parses "<prefix><number>"
     and stores the number in u. */
  ctl = 0;
  if (namelen ==  6 && (strncmp(name, "mxctlp", namelen) == 0)) {
    privileged = 1;
    ctl = 1;
  } else if (namelen ==  5 && (strncmp(name, "mxctl", namelen) == 0)) {
    privileged = 0;
    ctl = 1;
  } else if (dev_stdclone(name, NULL, "mxp", &u)) {
    privileged = 1;
  } else if (dev_stdclone(name, NULL, "mx", &u)) {
    privileged = 0;
  } else {/* Don't recognize the name */
    return;
  }

  /* privileged nodes are root-only, the others are world-accessible */
  if (privileged)
    mode = 0600;
  else
    mode = 0666;

  /* control devices carry an all-ones unit; board devices must name
     an existing instance */
  if (ctl)
    u = (unsigned int)-1;
  else {
    if (u >= mx_num_instances)
      return; /* unit too large */
  }

  /* suser() returns non-zero when the caller is not the superuser */
  if (privileged && suser(curthread))
    return; /* EPERM */


#if __FreeBSD_version < 600000
  /* Now we iterate over our clone list.  We start at index 0, and
     keep going until we find a free clone.  We know the clone is free
     because either the cdev is null (in which case it was never
     allocated, and no /dev/mx_fake.%d entry exists) or the cdev is
     non-null, and its si_drv1 field is null (which means that it has
     been closed by another process, and a /dev/mx_fake.%d exists).

     Its important to find priviliged devices, so we always search
     only odd units when we want a priviliged device.
  */
     
  fake_unit = 0 + privileged;

  do {
    i = clone_create(&mx_clones, &mx_cdevsw, &fake_unit, cdev, 0);
    free_cdev = i || ((*cdev)->si_drv1 == NULL);
#if 0
    printf("dev: %d. i: %d", fake_unit, i);
    if (i == 0)
      printf(" drv1: %p", (*cdev)->si_drv1);
    else
      printf(" drv1: NULL");
    printf(" Free = %d\n", free_cdev);
#endif
    if (!free_cdev)
      fake_unit+=2;
  } while (!free_cdev);
#else
  /* newer kernels: pass -1 and let clone_create() choose the unit */
  free_cdev = 1; /* -Wunused */
  fake_unit = -1;
  i = clone_create(&mx_clones, &mx_cdevsw, &fake_unit, cdev, 0);
#endif

  /* non-zero i means clone_create() did not hand back an existing cdev */
  if (i) {
    /* need to allocate a new /dev/mx_fake.%d device node */
    *cdev = make_dev(&mx_cdevsw, unit2minor(fake_unit), 
		    UID_ROOT, GID_WHEEL, 
		    mode, "mx_fake.%d", fake_unit);
  }
  if (*cdev != NULL) {
    /* Treat si_drv1 like a bitfield.  Low bit is "in use" flag,
       second bit is privileged bit, remainder is the real unit that 
       the opener requested */
    mx_always_assert((*cdev)->si_drv1 == NULL);
    (*cdev)->si_drv1 = (void *)(uintptr_t)(1 | (privileged << 1) | (u << 2));
    /* no endpoint yet; mx_set_endpoint() fills si_drv2 in later */
    (*cdev)->si_drv2 = NULL;
  }
}


/*
 * d_close entry point.  Closes and frees the endpoint attached to this
 * clone (if any), marks the clone as free for reuse by mx_clone(), and
 * drops the module use count.  Always returns 0.
 */
static int
mx_close(struct cdev *rdev, int flags, int devtype, struct thread *td)
{
  mx_endpt_state_t *endpt = rdev->si_drv2;

  /* detach the endpoint from the cdev before tearing it down */
  rdev->si_drv2 = NULL;
  if (endpt != NULL) {
    mx_common_close(endpt);
    mx_kfree(endpt);
  }

  /* a NULL si_drv1 is what marks the clone as free */
  rdev->si_drv1 = NULL;

  MOD_DEC_USE_COUNT;
  return 0;
}



/*
 * Handle MX_SET_ENDPOINT / MX_SET_RAW: allocate an endpoint state, open
 * it on the unit encoded in rdev->si_drv1, and attach it to the cdev
 * via si_drv2.
 *
 * rdev  clone device the ioctl arrived on
 * cmd   must be MX_SET_ENDPOINT or MX_SET_RAW, otherwise EINVAL
 * arg   user address of an mx_set_endpt_t (only used when !raw)
 * raw   non-zero to open the raw endpoint; nothing is copied in or out
 *
 * Returns 0 on success, EINVAL for a bad command or endpoint number,
 * ENOMEM on allocation failure, or the error from copyin,
 * mx_common_open() or copyout.
 */
static int
mx_set_endpoint(struct cdev *rdev, u_long cmd, const mx_uaddr_t arg, int raw)
{
  int32_t unit, status;
  mx_set_endpt_t set_endpt;
  mx_endpt_state_t *es;
  size_t len;

  if (cmd != MX_SET_ENDPOINT && cmd != MX_SET_RAW)
    return EINVAL;
  /* unit bounds checking was done in mx_clone, and will be
     done again in mx_common_open */

  unit = (uintptr_t)rdev->si_drv1 >> 2;

  /* In raw mode nothing is copied in, so start from a zeroed request
     rather than passing stack garbage to mx_common_open(). */
  bzero(&set_endpt, sizeof(set_endpt));

  if (!raw) {
    /* a failed copyin leaves set_endpt meaningless: bail out */
    status = mx_arch_copyin(arg, &set_endpt, sizeof(set_endpt));
    if (status != 0)
      return status;
    if (set_endpt.endpoint < 0 || set_endpt.endpoint >= mx_max_endpoints)
      return EINVAL;
  }

  es = mx_kmalloc(sizeof(*es), MX_MZERO|MX_WAITOK);
  if (es == NULL)
    return ENOMEM;

  /* bit 1 of si_drv1 is the privileged flag set by mx_clone() */
  es->privileged = (uintptr_t)rdev->si_drv1 & 2;
  es->is_kernel = 0;
  es->opener.pid = mx_kgetpid();
  es->arch.proc = curproc;
  status = mx_common_open(unit, set_endpt.endpoint, es, raw);

  /* record the opener's command name, truncated to whichever of the
     two buffers is smaller */
  if (sizeof(curproc->p_comm) > sizeof(es->opener.comm))
    len = sizeof(es->opener.comm);
  else
    len = sizeof(curproc->p_comm);
  bcopy(curproc->p_comm, es->opener.comm, len);

  if (status != 0) {
    mx_kfree(es);
    return (status);
  }
  set_endpt.session_id = es->session_id;
  if (!raw)
    status = mx_arch_copyout(&set_endpt, arg, sizeof(set_endpt));

  /* the endpoint stays attached even if the copyout failed; mx_close()
     releases it */
  rdev->si_drv2 = es;

  MX_DEBUG_PRINT (MX_DEBUG_OPENCLOSE,
                  ("Board %d, endpoint %d opened\n",
                   unit, set_endpt.endpoint));

  return status;
}



/*
 * d_ioctl entry point.
 *
 * Without an endpoint attached to the cdev, MX_SET_ENDPOINT and
 * MX_SET_RAW create one and every other command goes to
 * mx_endptless_ioctl().  With an endpoint, the command goes to
 * mx_common_ioctl() first and falls back to mx_endptless_ioctl() when
 * that returns ENOTTY; the endpoint's ref_count is held across the
 * call.
 */
int 
mx_ioctl(struct cdev *rdev, u_long cmd, caddr_t data,
	 int fflag, struct thread *td)
{
  mx_endpt_state_t *endpt = (mx_endpt_state_t *)rdev->si_drv2;
  mx_uaddr_t arg = 0;
  int privileged;
  int rc;

  /* the ioctl payload, when present, holds the user-space argument */
  if (data)
    arg = (mx_uaddr_t)*(caddr_t *)data;

  /* bit 1 of si_drv1 is the privileged flag set by mx_clone() */
  privileged = (uintptr_t)rdev->si_drv1 & 2;

  /* Some ioctls do not require a valid endpoint state */
  if (endpt == NULL) {
    if (cmd == MX_SET_ENDPOINT)
      return mx_set_endpoint(rdev, cmd, arg, 0);
    if (cmd == MX_SET_RAW)
      return mx_set_endpoint(rdev, cmd, arg, 1);
    return mx_endptless_ioctl(cmd, arg, privileged, 0);
  }

  /* others do: take a reference for the duration of the call */
  mx_mutex_enter(&endpt->sync);
  endpt->ref_count++;
  mx_mutex_exit(&endpt->sync);

  rc = mx_common_ioctl(endpt, cmd, arg);
  if (rc == ENOTTY)
    rc = mx_endptless_ioctl(cmd, arg, privileged, 0);

  mx_mutex_enter(&endpt->sync);
  endpt->ref_count--;
  mx_mutex_exit(&endpt->sync);

  return rc;
}

/*
 * d_open entry point.  Only clones prepared by mx_clone() and not yet
 * bound to an endpoint may be opened:
 *   - si_drv1 == NULL: free clone opened directly via /dev/mx_fake.%d
 *     without going through mx_clone -> ENXIO
 *   - si_drv2 != NULL: busy clone opened directly -> EBUSY
 * On success the module use count is bumped and 0 is returned.
 */
static int
mx_open(struct cdev *cdev, int flags, int devtype, struct thread *td)
{
  const int is_free_clone = (cdev->si_drv1 == NULL);
  const int has_endpoint = (cdev->si_drv2 != NULL);

  if (is_free_clone)
    return ENXIO;
  if (has_endpoint)
    return EBUSY;

  MOD_INC_USE_COUNT;
  return 0;
}

/*
 * Return 1 if kernel virtual address `kva` lies inside the mapped
 * device memory window of `is`, 0 otherwise.
 *
 * The window is the half-open range [start, start + size): the address
 * one past the end is not device memory, so the upper bound is
 * exclusive.
 */
static int
mx_is_iomem(mx_instance_state_t *is, void *kva)
{
  u_long devmem_start = (unsigned long)rman_get_virtual(is->arch.mem);
  u_long devmem_end = devmem_start + rman_get_size(is->arch.mem);
  u_long addr = (u_long)kva;

  return (addr >= devmem_start && addr < devmem_end) ? 1 : 0;
}

/*
 * MMAP:  FreeBSD maps each page on demand.  This is similar
 * to what linux calls the "nopage" method. Ie, this function
 * is called once (per thread) for each page, as it is touched.
 *
 * Translates the mmap offset into a kernel virtual address via
 * mx_mmap_off_to_kva() and stores the matching address in *paddr.
 * Returns EINVAL when no endpoint is attached to the cdev, otherwise
 * the status of the offset translation (0 on success).
 */

static int
mx_mmap(struct cdev *dev, vm_offset_t offset, vm_paddr_t *paddr, int nprot)
{
  mx_endpt_state_t *endpt = (mx_endpt_state_t *)dev->si_drv2;
  mx_page_pin_t *unused_pin;
  void *kva;
  int mem_type;
  int rc;

  if (endpt == NULL)
    return EINVAL;

  mx_mutex_enter(&endpt->sync);
  rc = mx_mmap_off_to_kva(endpt, offset, &kva, &mem_type, &unused_pin);
  if (rc == 0) {
    if (MX_CPU_alpha && mx_is_iomem(endpt->is, kva)) {
      /* alphas are weird, and want the virtual (K0SEG) address
         returned from mmap */
      *paddr = (vm_offset_t)kva;
    } else {
      *paddr = vtophys(kva);
    }
  } else {
    MX_DEBUG_PRINT
      (MX_DEBUG_KVA_TO_PHYS,
       ("status = %d, offset = 0x%lx\n", rc, (long)offset));
  }
  mx_mutex_exit(&endpt->sync);

  return rc;
}

/*
 * d_read entry point: copy the status string of the unit encoded in
 * rdev->si_drv1 to the caller, starting at uio->uio_offset.
 *
 * Returns EINVAL for an out-of-range unit, the error from
 * mx_instance_status_string() or uiomove(), or 0.  Reading at or
 * beyond the end of the string (or at a negative offset) transfers
 * nothing and returns 0.
 */
static int
mx_read(struct cdev *rdev, struct uio *uio, int flags __unused)
{
  int len;
  int status;
  unsigned int unit = (uintptr_t)rdev->si_drv1 >> 2;
  off_t offset;
  char *c;

  if (unit >= mx_max_instance)
    return EINVAL;

  status = mx_instance_status_string(unit, &c, &len);
  if (status)
    return status;

  /* Only touch the buffer when the offset really lies inside it: a
     negative offset or one past `len` would otherwise yield an
     out-of-bounds pointer or a negative length for uiomove(). */
  offset = uio->uio_offset;
  if (offset >= 0 && offset < (off_t)len && offset <= (off_t)strlen(c))
    status = uiomove(c + offset, MIN((len - offset), uio->uio_resid), uio);

  /* the status string was allocated for us; release it */
  mx_kfree(c);
  return status;
}

/*
 * Interrupt handler registered by mx_attach(); `arg` is the instance
 * state passed to bus_setup_intr().  Forwards to the common handler.
 */
static void
mx_intr(void *arg)
{
  mx_common_interrupt((mx_instance_state_t *)arg);
}

/*
 * Report a failed assertion (expression text, source line and file) on
 * the console, then enter the kernel debugger.
 */
void 
mx_assertion_failed (const char *assertion, int line, const char *file)
{
  printf("MX: assertion: <<%s>>  failed at line %d, file %s\n",
         assertion,
         line,
         file);
  kdb_enter("mx assertion failed");
}

/* Return a random value: the result of arc4random() converted to int. */
int
mx_rand(void)
{
  return (int)arc4random();
}


/* memory allocation and deallocation */

/* malloc(9) type under which all allocations of this driver are made */
MALLOC_DEFINE(M_MXBUF, "mx buf", "Buffers used by Myricom MX driver");

/*
 * Allocate `len` bytes of kernel memory under M_MXBUF.
 *
 * `flags` are passed through to malloc(); unless the caller asked for
 * MX_NOWAIT, MX_WAITOK is added.  Returns the allocation, or whatever
 * malloc() returns on failure.  Free with mx_kfree().
 */
void *
mx_kmalloc(size_t len, uint32_t flags)
{
  uint32_t mflags = flags;

  if (!(mflags & MX_NOWAIT))
    mflags |= MX_WAITOK;

  return malloc(len, M_MXBUF, mflags);
}

/* Release memory obtained from mx_kmalloc() (malloc type M_MXBUF). */
void
mx_kfree(void *ptr)
{
  free(ptr,
       M_MXBUF);
}

/*
 * Allocate one page of kernel memory and pin it for DMA.
 *
 * is          board instance the page is pinned for
 * alloc_addr  out: address to hand back to mx_free_dma_page()
 * addr        out: usable address of the page (same as *alloc_addr)
 * pin         out: pin descriptor; pin->va is filled in here and the
 *             DMA address by mx_pin_page()
 *
 * Returns 0 on success, ENOMEM if the allocation failed, or the error
 * from mx_pin_page().  On any failure nothing is left allocated and
 * both output pointers are NULL.
 */
int
mx_alloc_dma_page(mx_instance_state_t *is, char **alloc_addr,
                  char **addr, mx_page_pin_t *pin)
{
  char *page;
  int status;

  page = mx_kmalloc(PAGE_SIZE, MX_WAITOK);
  *alloc_addr = *addr = page;
  if (page == NULL)
    return ENOMEM;
  /* the allocation is expected to be page aligned */
  mx_assert(((uintptr_t)*addr & MX_VPAGE_MASK) == (uintptr_t)*addr);

  pin->va = (uint64_t)(uintptr_t)page;
  status = mx_pin_page(is, pin, MX_PIN_KERNEL | MX_PIN_CONSISTENT, 0);
  if (status) {
    /* report the failure instead of returning 0 with freed memory,
       and don't leave dangling pointers with the caller */
    mx_kfree(page);
    *alloc_addr = *addr = NULL;
  }
  return status;
}

/*
 * Undo mx_alloc_dma_page(): unpin the page described by `pin` and free
 * the memory at *alloc_addr.  In debug builds the caller's pointer is
 * also cleared.
 */
void
mx_free_dma_page(mx_instance_state_t *is, char **alloc_addr, mx_page_pin_t *pin)
{
  char *page = *alloc_addr;

  mx_unpin_page(is, pin, MX_PIN_KERNEL | MX_PIN_CONSISTENT);
  mx_kfree(page);

  if (MX_DEBUG) {
    *alloc_addr = 0;
  }
}


/*
 * Return the kernel virtual address of byte `offset` within the board's
 * register window.  The window is already mapped (is->arch.csr), so
 * this is plain address arithmetic; `len` is not used.
 */
void *
mx_map_io_space(mx_instance_state_t * is, uint32_t offset, uint32_t len)
{
  vm_offset_t base = (vm_offset_t) is->arch.csr;
  vm_offset_t delta = (vm_offset_t) offset;

  return (void *) (base + delta);
}

/*
 * Counterpart of mx_map_io_space().  Intentionally empty:
 * mx_map_io_space() creates no mapping of its own, so there is nothing
 * to undo.
 */
void
mx_unmap_io_space (mx_instance_state_t * is,
                        uint32_t len, void *kaddr)
{
  /* nothing to do */
}

/* synchronization */

/*
 * Initialize spinlock `s` as an MTX_SPIN mutex named `string`.
 * `is` and `endpoint` are not used by this implementation.
 */
void
mx_spin_lock_init(mx_spinlock_t *s, mx_instance_state_t *is, 
		  int endpoint, char *string)
{
  const char *lock_name = string;

  mtx_init(s, lock_name, NULL, MTX_SPIN);
}

/* Destroy a spinlock set up by mx_spin_lock_init(). */
void
mx_spin_lock_destroy(mx_spinlock_t *s)
{
  mtx_destroy(s);
  return;
}

/*
 * Initialize sync object `s`: zero it, then set up its two mutexes and
 * its condition variable.
 *
 * FreeBSD uses the name of the mutex to do runtime checking for
 * locking bugs, such as order reversals, sleeping while holding a
 * lock, etc.  So we must give each mutex a unique string; it is built
 * from the board id (-1 when `is` is NULL), the endpoint number and
 * the caller-supplied `string`.
 */
void
mx_sync_init(mx_sync_t *s, mx_instance_state_t *is, int endpoint, char *string)
{
  int board_id = -1;

  bzero(s, sizeof(*s));

  if (is)
    board_id = is->id;

  /* the names live inside the sync object itself */
  snprintf(&s->mtx_string[0], MX_MTX_STRLEN, "mx(%d,%d): %s",
           board_id, endpoint, string);
  snprintf(&s->sleep_mtx_string[0], MX_MTX_STRLEN, "mxsleep(%d,%d): %s",
           board_id, endpoint, string);

  mtx_init(&s->mtx, &s->mtx_string[0], NULL, MTX_DEF);
  mtx_init(&s->sleep_mtx, &s->sleep_mtx_string[0], NULL, MTX_DEF);
  cv_init(&s->cv, "mx cv");
}

/*
 * Tear down a sync object set up by mx_sync_init(): destroy its
 * condition variable and both mutexes, then zero the whole structure.
 */
void
mx_sync_destroy(mx_sync_t *s)
{
  cv_destroy(&s->cv);
  mtx_destroy(&s->sleep_mtx);
  mtx_destroy(&s->mtx);

  bzero(s, sizeof(*s));
}

/* Discard any pending wake-ups on `s` (wake_cnt back to 0, under
   sleep_mtx). */
void
mx_sync_reset(mx_sync_t *s)
{
  struct mtx *lock = &s->sleep_mtx;

  mtx_lock(lock);
  s->wake_cnt = 0;
  mtx_unlock(lock);
}
/* Post one wake-up on `s`: bump wake_cnt and signal the condition
   variable, both under sleep_mtx. */
void
mx_wake(mx_sync_t * s)
{
  struct mtx *lock = &s->sleep_mtx;

  mtx_lock(lock);
  s->wake_cnt += 1;
  cv_signal(&s->cv);
  mtx_unlock(lock);
}

/*
 * Wait until a wake-up has been posted on `s` with mx_wake(), then
 * consume it.
 *
 * s      sync object to sleep on
 * ms     timeout in milliseconds, or MX_MAX_WAIT to wait indefinitely
 * flags  MX_SLEEP_INTR makes the sleep interruptible by signals
 *
 * Returns 0 when a wake-up was consumed, otherwise the non-zero value
 * returned by the cv_*wait* call that ended the sleep (timeout or
 * signal).
 */
int
mx_sleep(mx_sync_t *s, int ms, int flags)
{
  int ret = 0;
  int timo = 0;
  const int timed = (ms != MX_MAX_WAIT);

  MX_DEBUG_PRINT (MX_DEBUG_SLEEP, ("mx_sleep() on chan. %p, %d ms\n",
                                   s, ms));

  if (timed) {
    /* Convert ms to clock ticks in 64 bits: ms * hz overflows an int
       for long timeouts.  Clamp to the largest int tick count. */
    int64_t timo64 = ((int64_t)ms * hz) / 1000;

    if (timo64 > 0x7fffffff)
      timo64 = 0x7fffffff;
    timo = (int)timo64;
  }

  mtx_lock(&s->sleep_mtx);

  /* loop: wake-ups can be spurious or consumed by another sleeper */
  while (ret == 0 && s->wake_cnt <= 0) {
    if (flags & MX_SLEEP_INTR) {
      if (timed)
        ret = cv_timedwait_sig(&s->cv, &s->sleep_mtx, timo);
      else
        ret = cv_wait_sig(&s->cv, &s->sleep_mtx);
    }
    else {
      if (timed)
        ret = cv_timedwait(&s->cv, &s->sleep_mtx, timo);
      else
        cv_wait(&s->cv, &s->sleep_mtx);
    }
  }

  /* only a successful wait consumes a wake-up */
  if (ret == 0)
    s->wake_cnt--;
  mtx_unlock(&s->sleep_mtx);

  MX_DEBUG_PRINT (MX_DEBUG_SLEEP,
            ("mx_sleep() returning %d.\n", ret));

  return ret;
}

/* pinning / unpinning pages */

/*
 * Pin the page at pin->va and store its DMA address in pin->dma.
 *
 * flags selects how pin->va is interpreted:
 *   MX_PIN_PHYSICAL  pin->va already is the DMA address; it is copied
 *                    into pin->dma unchanged
 *   MX_PIN_KERNEL    pin->va is a kernel virtual address; only the
 *                    translation is done
 *   otherwise        pin->va is a user address of the current process;
 *                    the page is faulted in, wired, recorded in
 *                    pin->page, and charged against
 *                    mx_max_user_pinned_pages
 *
 * Returns 0 on success, ENOMEM when the user pin budget is exhausted,
 * EPERM when the page cannot be faulted in.  `is` and `memory_context`
 * are not used here.
 */
int
mx_pin_page(mx_instance_state_t *is, mx_page_pin_t *pin, int flags, uint64_t memory_context)
{
  vm_paddr_t pa;
  caddr_t addr;
  int status;
  struct pmap *pmap = &curproc->p_vmspace->vm_pmap;
  struct vm_page *m;

  addr = (caddr_t)(vm_offset_t)pin->va;

  if (flags & MX_PIN_PHYSICAL) {
    pin->dma.low = MX_LOWPART_TO_U32(pin->va);
    pin->dma.high = MX_HIGHPART_TO_U32(pin->va);
    return 0;
  }

  if (flags & MX_PIN_KERNEL) {
    /* kernel address are always pinned in FreeBSD, just
       get the DMA address */
    pa = vtophys(addr);
    pin->dma.low = MX_LOWPART_TO_U32(pa);
    pin->dma.high = MX_HIGHPART_TO_U32(pa);
    return 0;
  } 

  /* NOTE(review): the budget is checked here but only decremented at
     the end, so concurrent callers can overshoot it -- confirm whether
     that matters. */
  if (mx_atomic_read(&mx_max_user_pinned_pages) <= 0)
    return ENOMEM;

  /* fault in page if needed */
 retry:
  status = vm_fault_quick(addr, VM_PROT_READ|VM_PROT_WRITE);

  /* vm_fault_quick returns -1 on failure */
  if (status < 0) {
    MX_WARN(("vm_fault_quick(%p,..) failed\n", addr));
    return EPERM;
  }
  
  /* the mapping can disappear again between the fault and the lookup;
     NOTE(review): this retry is unbounded, unlike the 50-try limit in
     mx_arch_copy_from_map() */
  pa = pmap_extract(pmap, (vm_offset_t)addr);
  if (pa == 0) {
    MX_WARN(("mx_pin_page: type 1 race against user address, retrying\n"));
    goto retry;
  }
  m = PHYS_TO_VM_PAGE(pa);
  /* wire the page under the page-queues lock */
  vm_page_lock_queues();
  vm_page_wire(m);
  vm_page_unlock_queues();
  /* remember the page so mx_unpin_page() can unwire it */
  pin->page = m;
  pin->dma.low = MX_LOWPART_TO_U32(pa);
  pin->dma.high = MX_HIGHPART_TO_U32(pa);
  mx_atomic_subtract(1, &mx_max_user_pinned_pages);
  
  return (0);
}

/*
 * Undo mx_pin_page() for a user page: unwire pin->page and give one
 * page back to the mx_max_user_pinned_pages budget.  Kernel and
 * physical pins (MX_PIN_KERNEL / MX_PIN_PHYSICAL) hold nothing, so
 * they are a no-op.
 */
void
mx_unpin_page(mx_instance_state_t *is, mx_page_pin_t *pin, int flags)
{
  struct vm_page *page;

  if ((flags & (MX_PIN_KERNEL | MX_PIN_PHYSICAL)) != 0)
    return;

  page = pin->page;

  vm_page_lock_queues();
  vm_page_unwire(page, 0);
  /* if the page no longer has an object, that means we
     have the last ref. to it and we need to free it */
  if (page->wire_count == 0 && page->object == NULL) {
    vm_page_free(page);
  }
  vm_page_unlock_queues();

  mx_atomic_add(1, &mx_max_user_pinned_pages);
}

/*
 * Copy `length` bytes from user address `usrc` in the address space
 * described by pmap/vm_map (any process, not only the current one)
 * into the kernel buffer `kdst`.
 *
 * The copy proceeds one source page at a time: the page is looked up
 * and held, mapped into the kernel with an sf_buf, copied from, and
 * released again.
 *
 * Returns 0 on success, the vm_fault() status if a page cannot be
 * faulted in, EFAULT after more than 50 consecutive lookup failures on
 * one page, or ENOMEM if no sf_buf is available.
 */
static int
mx_arch_copy_from_map(pmap_t pmap, struct vm_map *vm_map, int sf_buf_flags, mx_uaddr_t usrc,
		      char *kdst,
		      uint32_t length)
{
  vm_offset_t page_off, chunk;
  vm_page_t page;
  struct sf_buf *sf;
  char *src;
  int status = 0;
  int retries = 0;

  while (length != 0) {
    /* look up the source page and hold it in one step, faulting it in
       and trying again while it is not resident */
    while ((page = pmap_extract_and_hold(pmap, usrc, VM_PROT_READ)) == NULL) {
      status = vm_fault(vm_map, usrc, VM_PROT_READ, 0);
      if (status != 0) {
        MX_WARN(("mx_arch_copy_from_map src page = NULL for addr 0x%lx\n", (long)usrc));
        return status;
      }
      /* guard against looping forever; should not be needed */
      if (++retries > 50) {
        MX_WARN(("mx_arch_copy_from_map: too many races for 0x%lx\n", (long)usrc));
        return EFAULT;
      }
    }
    retries = 0;

    /* map the page into the kernel */
    sf = sf_buf_alloc(page, sf_buf_flags);
    if (sf == NULL) {
      vm_page_unhold(page);
      return ENOMEM;
    }

    /* copy up to the end of this page, or of the request if sooner */
    page_off = usrc & (PAGE_SIZE-1);
    chunk = PAGE_SIZE - page_off;
    if (chunk > length)
      chunk = length;

    src = (char *)sf_buf_kva(sf) + page_off;
    bcopy(src, kdst, chunk);

    usrc += chunk;
    kdst += chunk;
    length -= chunk;

    /* release the mapping and let go of the page */
    sf_buf_free(sf);
    vm_page_unhold(page);
  }

  return 0;
}

/*
 * Description of the source address space for a direct get.  Filled in
 * by mx_direct_get() and received by mx_arch_copy_user_to_user() as
 * its opaque `src_space` argument.
 */
struct direct_get_callback_param {
  pmap_t pmap;              /* pmap of the source process */
  struct vm_map * vm_map;   /* vm_map of the source process, used to fault pages in */
  int sf_buf_flags;         /* flags for sf_buf_alloc(): SFB_CPUPRIVATE or 0 */
};

/* OS specific callback for direct get, copying from another process
 * user-space to current process user-space.
 *
 * udst       destination user address in the current process
 * usrc       source user address in the other process
 * src_space  struct direct_get_callback_param describing the source
 *            address space
 * length     number of bytes to copy
 *
 * The copy proceeds one source page at a time: the page is looked up
 * and held, mapped into the kernel with an sf_buf, copied out, and
 * released again.
 *
 * Returns 0 on success, the vm_fault() status if a source page cannot
 * be faulted in, EFAULT after more than 50 consecutive lookup failures
 * on one page, ENOMEM if no sf_buf is available, or the copyout()
 * error if the destination cannot be written.
 */
int
mx_arch_copy_user_to_user(mx_uaddr_t udst,
                          mx_uaddr_t usrc, void * src_space,
                          uint32_t length)
{
  struct direct_get_callback_param * param = (struct direct_get_callback_param *) src_space;
  pmap_t pmap = param->pmap;
  struct vm_map *vm_map = param->vm_map;
  int sf_buf_flags = param->sf_buf_flags;
  vm_offset_t offset, len;
  vm_page_t page;
  struct sf_buf *sf;
  char *kva;
  int status = 0;
  int race_cnt = 0;

  /* Walk the addresses, copying page by page. */

  while (length != 0) {
  race:
    /* look up the source page and hold it in one step */
    page = pmap_extract_and_hold(pmap, usrc, VM_PROT_READ);
    if (page == NULL) {
      /* page was not present, attempt to fault it in */
      status = vm_fault(vm_map, usrc, VM_PROT_READ, 0);
      if (status != 0) {
        MX_WARN(("mx_arch_copy_user_to_user src page = NULL for addr 0x%lx\n", (long)usrc));
        return status;
      }
      /* make sure we don't go into an infinite loop, should not be
         needed */
      race_cnt++;
      if (race_cnt > 50) {
        MX_WARN(("mx_arch_copy_user_to_user: too many races for 0x%lx\n", (long)usrc));
        return EFAULT;
      }
      goto race;
    }
    race_cnt = 0;

    /* map the page into the kernel */
    sf = sf_buf_alloc(page, sf_buf_flags);
    if (sf == NULL) {
      vm_page_unhold(page);
      return ENOMEM;
    }

    /* Find the next page boundary */
    offset = usrc & (PAGE_SIZE-1);
    len = PAGE_SIZE - offset;

    if (len > length)
      len = length;

    /* add the offsets to compute the kernel virtual addresses */
    kva = (char *)sf_buf_kva(sf) + offset;

    /* do the copy; the destination is a user address and may fault */
    status = copyout(kva, (void *) udst, len);

    /* release the mappings and let go of the page, success or not */
    sf_buf_free(sf);
    vm_page_unhold(page);

    /* don't report success for data that never reached the caller */
    if (status != 0)
      return status;

    usrc += len;
    udst += len;
    length -= len;
  }

  return 0;
}

/*
 * Copy `length` bytes from the segments of src_es's process into the
 * segments of the current (destination) process.
 *
 * When a side has more than one segment, the segs argument holds in
 * segs[0].vaddr the user address of the real segment array, which is
 * copied into a temporary kernel buffer first: with copyin() for the
 * destination (current process), and page by page through the source
 * process' pmap for the source.  The copy itself is done by
 * mx_direct_get_common(), which receives the source address-space
 * description in `param`.
 *
 * Returns 0 on success, ENXIO if the source process has no vmspace,
 * ENOMEM on allocation failure, or the error from the segment fetch or
 * from mx_direct_get_common().  `dst_es` is not used here.
 */
int
mx_direct_get(mx_endpt_state_t *dst_es, mx_shm_seg_t *dst_segs, uint32_t dst_nsegs,
	      mx_endpt_state_t *src_es, mx_shm_seg_t *src_segs, uint32_t src_nsegs,
	      uint32_t length)
{
  int status;
  struct direct_get_callback_param param;

  /* make sure we can find the src pmap, might
     be overly conservative */
  if (src_es->arch.proc == NULL ||
      src_es->arch.proc->p_vmspace == NULL) {
    status = ENXIO;
    MX_WARN(("No vmspace in mx_direct_get?!\n"));
    goto abort_with_nothing;
  }

  /* get destination segments from current process */
  if (dst_nsegs > 1) {
    mx_uaddr_t uptr = dst_segs[0].vaddr;
    /* from here on dst_segs points at our own kernel copy */
    dst_segs = mx_kmalloc(dst_nsegs * sizeof(*dst_segs), 0);
    if (!dst_segs) {
      status = ENOMEM;
      goto abort_with_nothing;
    }
    status = copyin((void*) uptr, dst_segs, dst_nsegs * sizeof(*dst_segs));
    if (status) {
      goto abort_with_dst_segs;
    }
  }

#ifdef SFB_CPUPRIVATE
  /* It is cheaper to make the phys -> kernel mapping private to this
     cpu, but we must pin the current thread to this cpu to do this */
  param.sf_buf_flags = SFB_CPUPRIVATE;
  sched_pin();
#else
  param.sf_buf_flags = 0;
#endif

  param.pmap = &src_es->arch.proc->p_vmspace->vm_pmap;
  param.vm_map = &src_es->arch.proc->p_vmspace->vm_map;

  /* get source segments from the source process */
  if (src_nsegs > 1) {
    mx_uaddr_t uptr = src_segs[0].vaddr;
    /* from here on src_segs points at our own kernel copy */
    src_segs = mx_kmalloc(src_nsegs * sizeof(*src_segs), 0);
    if (!src_segs) {
      status = ENOMEM;
      goto abort_with_cpu_pin;
    }
    status = mx_arch_copy_from_map(param.pmap, param.vm_map, param.sf_buf_flags,
				   uptr, (char *) src_segs, src_nsegs * sizeof(*src_segs));
    if (status) {
      goto abort_with_src_segs;
    }
  }

  status = mx_direct_get_common(dst_segs, dst_nsegs,
				&param, src_segs, src_nsegs,
				length);

  /* the success path falls through the same cleanup as the error paths */
 abort_with_src_segs:
  if (src_nsegs > 1)
    mx_kfree (src_segs);
 abort_with_cpu_pin:
#ifdef SFB_CPUPRIVATE  
  sched_unpin();
#endif
 abort_with_dst_segs:
  if (dst_nsegs > 1)
    mx_kfree (dst_segs);
 abort_with_nothing:
  return status;
}

/****************************************************************
 * PCI config space functions
 ****************************************************************/
/*
 * Generates mx_read_pci_config_<size>(is, offset, value): reads size/8
 * bytes of the board's PCI configuration space at `offset` with
 * pci_read_config() and stores them in *value.  The generated function
 * always returns 0.  Instantiated for 32, 16 and 8 bits.
 */
#define pcibios_to_mx_read(size)                                           \
int                                                                        \
mx_read_pci_config_##size (mx_instance_state_t *is,                        \
			   uint32_t offset, uint##size##_t *value)         \
{                                                                          \
  *value =  (uint##size##_t) pci_read_config(is->arch.dev, offset, size/8);\
  return (0);                                                              \
}
pcibios_to_mx_read(32)
pcibios_to_mx_read(16)
pcibios_to_mx_read(8)

/*
 * Generates mx_write_pci_config_<size>(is, offset, value): writes the
 * low size/8 bytes of `value` to the board's PCI configuration space at
 * `offset` with pci_write_config().  The generated function always
 * returns 0.  Instantiated for 32, 16 and 8 bits.
 */
#define pcibios_to_mx_write(size)                                          \
int                                                                        \
mx_write_pci_config_##size (mx_instance_state_t *is,                       \
			    uint32_t offset, uint##size##_t value)         \
{                                                                          \
  pci_write_config(is->arch.dev, offset, (u_int32_t)value, size/8);        \
  return (0);                                                              \
}
pcibios_to_mx_write(32)
pcibios_to_mx_write(16)
pcibios_to_mx_write(8)

void
mx_set_default_hostname(void)
{
  strlcpy(mx_default_hostname, hostname, sizeof(mx_default_hostname));
}
